This is a case study of Google Analyst Certification program in Coursera. Following analysis mainly follow the guides, fictional scenario, and this script of the case study document. Data sources are from Motivate International Inc. under this license. You can reference the data in this link.
Cyclistic is a fictional bike-share company in Chicago. Cyclistic’s finance analysts have concluded that annual members are much more profitable than casual riders. Cyclistic decide to design marketing strategies aim at converting casual riders into annual members. In order to do that, the marketing analyst team analyze Cyclistic’s historical trip data to better understand following 3 questions :
This report introduce the findings of data analysis for the first question above.
# for wrangle , visualize data
#install.packages("tidyverse") ; install.packages("ggplot2")
library(tidyverse) ; library(lubridate) ; library(ggplot2) ; library(scales)
# For Chinese text in ggplot
#install.packages("showtext")
library(showtext) ; showtext_auto()
# for Google map
#install.packages("ggmap")
library(ggmap) ; library(RgoogleMaps)
# for other Map libraries
#install.packages("maps") ; install.packages("mapproj") ; install.packages("sf") ; install.packages("mapview") ; install.packages("RColorBrewer")
library(maps) ; library(mapproj) ; library(sf) ; library(mapview) ; library(RColorBrewer)
Download data file from https://divvy-tripdata.s3.amazonaws.com/index.html then to unzip and upload to working directory ( data/csv ).
Available data are from year 2013 to 2021-04 at the time doing this analysis. Data files are continuously updated in that website, . After exploring some period of data files, following issues are found:
Without available information to clarify issues above and in order to do yearly comparison, this analysis is based on data period from 2017-01 to 2020-12. Consistency issues of column names and contents are addressed in following Cleaning Data step.
data_files <- list.files(pattern = "^Divvy_Trips_2017_Q[1-4].csv") # Identify file names
data_files
## [1] "Divvy_Trips_2017_Q1.csv" "Divvy_Trips_2017_Q2.csv"
## [3] "Divvy_Trips_2017_Q3.csv" "Divvy_Trips_2017_Q4.csv"
trip_2017 = data.frame(matrix(ncol=0,nrow=0))
for(i in 1:length(data_files)) {
trip_data <- read_csv(data_files[i],col_types="ccccnccccccn") %>%
mutate(start_time = parse_date_time(start_time, c("%m/%d/%Y %H:%M:%S","%m/%d/%Y %H:%M") ),
end_time = parse_date_time(end_time, c("%m/%d/%Y %H:%M:%S","%m/%d/%Y %H:%M") ) )
trip_2017 <- bind_rows(trip_2017, trip_data)
}
trip_2017 <- rename(trip_2017
,ride_id = trip_id
,rideable_type = bikeid
,started_at = start_time
,ended_at = end_time
,start_station_name = from_station_name
,start_station_id = from_station_id
,end_station_name = to_station_name
,end_station_id = to_station_id
,member_casual = usertype)
trip_2018_q1 <- read_csv("Divvy_Trips_2018_Q1.csv",col_types="cTTcnccccccn")
trip_2018_q1 <- rename(trip_2018_q1
,ride_id = "01 - Rental Details Rental ID"
,rideable_type = "01 - Rental Details Bike ID"
,started_at = "01 - Rental Details Local Start Time"
,ended_at = "01 - Rental Details Local End Time"
,start_station_name = "03 - Rental Start Station Name"
,start_station_id = "03 - Rental Start Station ID"
,end_station_name = "02 - Rental End Station Name"
,end_station_id = "02 - Rental End Station ID"
,member_casual = "User Type")
data_files <- list.files(pattern = "^Divvy_Trips_2018_Q[2-4].csv") # Identify file names
data_files
trip_2018_q234 = data.frame(matrix(ncol=0,nrow=0))
for(i in 1:length(data_files)) {
trip_data <- read_csv(data_files[i],col_types="cTTcnccccccn")
trip_2018_q234 <- bind_rows(trip_2018_q234, trip_data)
}
trip_2018_q234 <- rename(trip_2018_q234
,ride_id = trip_id
,rideable_type = bikeid
,started_at = start_time
,ended_at = end_time
,start_station_name = from_station_name
,start_station_id = from_station_id
,end_station_name = to_station_name
,end_station_id = to_station_id
,member_casual = usertype)
trip_2018 <- bind_rows(trip_2018_q1, trip_2018_q234)
data_files <- list.files(pattern = "^Divvy_Trips_2019_Q[1,3,4].csv") # Identify file names
data_files
trip_2019_q134 = data.frame(matrix(ncol=0,nrow=0))
for(i in 1:length(data_files)) {
trip_data <- read_csv(data_files[i],col_types="dTTdndcdcccd")
trip_2019_q134 <- bind_rows(trip_2019_q134, trip_data)
}
trip_2019_q134 <- rename(trip_2019_q134
,ride_id = trip_id
,rideable_type = bikeid
,started_at = start_time
,ended_at = end_time
,start_station_name = from_station_name
,start_station_id = from_station_id
,end_station_name = to_station_name
,end_station_id = to_station_id
,member_casual = usertype)
trip_2019_q2 <- read_csv("Divvy_Trips_2019_Q2.csv",col_types="dTTdndcdcccd")
trip_2019_q2 <- rename(trip_2019_q2
,ride_id = "01 - Rental Details Rental ID"
,rideable_type = "01 - Rental Details Bike ID"
,started_at = "01 - Rental Details Local Start Time"
,ended_at = "01 - Rental Details Local End Time"
,start_station_name = "03 - Rental Start Station Name"
,start_station_id = "03 - Rental Start Station ID"
,end_station_name = "02 - Rental End Station Name"
,end_station_id = "02 - Rental End Station ID"
,member_casual = "User Type")
trip_2019 <- bind_rows(trip_2019_q134,trip_2019_q2)
# Convert ride_id and rideable_type to character so that they can stack correctly
trip_2019 <- mutate(trip_2019, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type)
,start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Use for loop load data filename bigger than yeqr 2020 then filter out 2021 data.
trip_2020_q1 <- read_csv("Divvy_Trips_2020_Q1.csv",col_types="ccTTccccnnnnc")
data_files <- list.files(pattern = "^202[0,1][0-9]{2}-divvy-tripdata.csv") # Identify file names
data_files
trip_after_2020_q1 = data.frame(matrix(ncol=0,nrow=0))
for(i in 1:length(data_files)) {
trip_data <- read_csv(data_files[i],col_types="ccTTccccnnnnc")
trip_after_2020_q1 <- bind_rows(trip_after_2020_q1, trip_data)
}
trip_from_2020 <- bind_rows(trip_2020_q1,trip_after_2020_q1)
trip_2020 <- trip_from_2020 %>%
filter( year(started_at) == 2020 )
all_trips <- bind_rows(trip_2017,trip_2018,trip_2019,trip_2020)
# Remove lat, long, birthyear, and gender fields as this data was dropped beginning in 2020
all_trips <- all_trips %>%
select(-c( birthyear, gender, "01 - Rental Details Duration In Seconds Uncapped", "05 - Member Details Member Birthday Year", "Member Gender", "tripduration"))
colnames(all_trips) #List of column names
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
## [10] "start_lat" "start_lng" "end_lat"
## [13] "end_lng"
nrow(all_trips) #How many rows are in data frame?
## [1] 14791783
dim(all_trips) #Dimensions of the data frame?
## [1] 14791783 13
head(all_trips,3) #See the first 6 rows of data frame. Also tail(qs_raw)
## ride_id started_at ended_at rideable_type
## 1 13518905 2017-03-31 23:59:07 2017-04-01 00:13:24 5292
## 2 13518904 2017-03-31 23:56:25 2017-04-01 00:00:21 4408
## 3 13518903 2017-03-31 23:55:33 2017-04-01 00:01:21 696
## start_station_id start_station_name end_station_id
## 1 66 Clinton St & Lake St 171
## 2 199 Wabash Ave & Grand Ave 26
## 3 520 Greenview Ave & Jarvis Ave 432
## end_station_name member_casual start_lat start_lng end_lat end_lng
## 1 May St & Cullerton St Subscriber NA NA NA NA
## 2 McClurg Ct & Illinois St Subscriber NA NA NA NA
## 3 Clark St & Lunt Ave Subscriber NA NA NA NA
str(all_trips) #See list of columns and data types (numeric, character, etc)
## 'data.frame': 14791783 obs. of 13 variables:
## $ ride_id : chr "13518905" "13518904" "13518903" "13518902" ...
## $ started_at : POSIXct, format: "2017-03-31 23:59:07" "2017-03-31 23:56:25" ...
## $ ended_at : POSIXct, format: "2017-04-01 00:13:24" "2017-04-01 00:00:21" ...
## $ rideable_type : chr "5292" "4408" "696" "4915" ...
## $ start_station_id : chr "66" "199" "520" "110" ...
## $ start_station_name: chr "Clinton St & Lake St" "Wabash Ave & Grand Ave" "Greenview Ave & Jarvis Ave" "Dearborn St & Erie St" ...
## $ end_station_id : chr "171" "26" "432" "142" ...
## $ end_station_name : chr "May St & Cullerton St" "McClurg Ct & Illinois St" "Clark St & Lunt Ave" "McClurg Ct & Erie St" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ start_lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ start_lng : num NA NA NA NA NA NA NA NA NA NA ...
## $ end_lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ end_lng : num NA NA NA NA NA NA NA NA NA NA ...
summary(all_trips) #Statistical summary of data. Mainly for numerics
## ride_id started_at ended_at
## Length:14791783 Min. :2017-01-01 00:00:36 Min. :2017-01-01 00:06:32
## Class :character 1st Qu.:2017-11-30 12:04:00 1st Qu.:2017-11-30 12:16:00
## Mode :character Median :2018-12-19 09:52:01 Median :2018-12-19 10:09:31
## Mean :2019-01-11 02:20:28 Mean :2019-01-11 02:42:31
## 3rd Qu.:2019-11-30 09:55:12 3rd Qu.:2019-11-30 10:23:33
## Max. :2020-12-31 23:59:59 Max. :2021-01-03 08:54:11
##
## rideable_type start_station_id start_station_name end_station_id
## Length:14791783 Length:14791783 Length:14791783 Length:14791783
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## end_station_name member_casual start_lat start_lng
## Length:14791783 Length:14791783 Min. :42 Min. :-88
## Class :character Class :character 1st Qu.:42 1st Qu.:-88
## Mode :character Mode :character Median :42 Median :-88
## Mean :42 Mean :-88
## 3rd Qu.:42 3rd Qu.:-88
## Max. :42 Max. :-88
## NA's :11250100 NA's :11250100
## end_lat end_lng
## Min. :42 Min. :-88
## 1st Qu.:42 1st Qu.:-88
## Median :42 Median :-88
## Mean :42 Mean :-88
## 3rd Qu.:42 3rd Qu.:-88
## Max. :42 Max. :-87
## NA's :11254355 NA's :11254355
table(all_trips$member_casual)
##
## casual Customer Dependent member Subscriber
## 1366575 2394665 7 2175108 8855428
There are 5 customer type values in above table. In this analysis define only 2 type :
all_trips <- all_trips %>%
mutate(member_casual = recode(member_casual
,"Subscriber" = "member"
,"Customer" = "casual"
,"Dependent" = "casual"))
# Check to make sure the proper number of observations were reassigned
table(all_trips$member_casual)
##
## casual member
## 3761247 11030536
This will allow us to aggregate ride data for each month, day, or year … before completing these operations we could only aggregate at the ride level https://www.statmethods.net/input/dates.html more on date formats in R found at that link
all_trips$date <- as.Date(all_trips$started_at) #The default format is yyyy-mm-dd
all_trips$month <- as.numeric( format(as.Date(all_trips$date), "%m") )
all_trips$day <- as.numeric( format(as.Date(all_trips$date), "%d") )
all_trips$hour <- as.numeric( format(as.Date(all_trips$date), "%H") )
all_trips$year <- as.numeric( format(as.Date(all_trips$date), "%Y") )
# Let day of week label in english
Sys.setlocale("LC_TIME", "en_US")
## [1] "en_US"
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/difftime.html
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at, units = "mins")
# convert ride_length to numeric
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE
str(all_trips)
## 'data.frame': 14791783 obs. of 20 variables:
## $ ride_id : chr "13518905" "13518904" "13518903" "13518902" ...
## $ started_at : POSIXct, format: "2017-03-31 23:59:07" "2017-03-31 23:56:25" ...
## $ ended_at : POSIXct, format: "2017-04-01 00:13:24" "2017-04-01 00:00:21" ...
## $ rideable_type : chr "5292" "4408" "696" "4915" ...
## $ start_station_id : chr "66" "199" "520" "110" ...
## $ start_station_name: chr "Clinton St & Lake St" "Wabash Ave & Grand Ave" "Greenview Ave & Jarvis Ave" "Dearborn St & Erie St" ...
## $ end_station_id : chr "171" "26" "432" "142" ...
## $ end_station_name : chr "May St & Cullerton St" "McClurg Ct & Illinois St" "Clark St & Lunt Ave" "McClurg Ct & Erie St" ...
## $ member_casual : chr "member" "member" "member" "member" ...
## $ start_lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ start_lng : num NA NA NA NA NA NA NA NA NA NA ...
## $ end_lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ end_lng : num NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2017-03-31" "2017-03-31" ...
## $ month : num 3 3 3 3 3 3 3 3 3 3 ...
## $ day : num 31 31 31 31 31 31 31 31 31 31 ...
## $ hour : num 0 0 0 0 0 0 0 0 0 0 ...
## $ year : num 2017 2017 2017 2017 2017 ...
## $ day_of_week : chr "Friday" "Friday" "Friday" "Friday" ...
## $ ride_length : num 14.28 3.93 5.8 4.8 6.92 ...
# Remove "bad" data
# The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative or ride_length bigger than 7 days ( all_trips$ride_length> 7*24*60 )
# We will create a new version of the dataframe (v2) since data is being removed
# Caution: if the column allow NA then be careful to write logic expression if NA being excluded is not what you want. You may need to add is.na(....) expression to address this issue.
all_trips_v2 <- all_trips %>%
filter( ( all_trips$start_station_name != "HQ QR" | is.na(all_trips$start_station_name) ) & all_trips$ride_length >= 0 )
# Following method has many NA rows 10948434 , contain 94609 NA rows and if start_station_name is NA that row is excluded
#all_trips_v3 <- all_trips[ !(all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0),] %>%
# filter(if_any(everything(), ~ !is.na(.)))
removed_data <- all_trips %>%
filter( all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0 )
print(paste("all_trips:", nrow(all_trips), " - all_trips_v2:", nrow(all_trips_v2), " - removed_data:", nrow(removed_data), " = ", nrow(all_trips) - nrow(all_trips_v2) - nrow(removed_data) ) )
## [1] "all_trips: 14791783 - all_trips_v2: 14777437 - removed_data: 14346 = 0"
summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 6.93e+00 1.20e+01 2.28e+01 2.14e+01 2.39e+05
#mean(all_trips_v2$ride_length) #straight average (total ride length / rides)
#median(all_trips_v2$ride_length) #midpoint number in the ascending array of ride lengths
#max(all_trips_v2$ride_length) #longest ride
#min(all_trips_v2$ride_length) #shortest ride
#aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
#aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
#aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
#aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
# Notice that the days of the week are out of order. Let's fix that.
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Sunday 51.39386
## 2 member Sunday 15.70592
## 3 casual Monday 47.30135
## 4 member Monday 13.42959
## 5 casual Tuesday 47.39071
## 6 member Tuesday 13.36297
## 7 casual Wednesday 47.33873
## 8 member Wednesday 13.43516
## 9 casual Thursday 48.97422
## 10 member Thursday 13.48475
## 11 casual Friday 49.52023
## 12 member Friday 13.62657
## 13 casual Saturday 47.94270
## 14 member Saturday 15.91148
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()
group_by(member_casual, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(member_casual, weekday)
## # A tibble: 14 x 4
## # Groups: member_casual [2]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <dbl>
## 1 casual Sun 772525 51.4
## 2 casual Mon 428941 47.3
## 3 casual Tue 367493 47.4
## 4 casual Wed 374538 47.3
## 5 casual Thu 402292 49.0
## 6 casual Fri 501375 49.5
## 7 casual Sat 907052 47.9
## 8 member Sun 1077993 15.7
## 9 member Mon 1670383 13.4
## 10 member Tue 1806397 13.4
## 11 member Wed 1808689 13.4
## 12 member Thu 1788179 13.5
## 13 member Fri 1682923 13.6
## 14 member Sat 1188657 15.9
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
#scale_fill_manual(values=alpha(c("#00A2FF","#0076BA"), .91)) +
labs(title = "Number of rides by day of week" , fill=" customer") +
xlab("day of week") + ylab("number of rides")
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge") +
#scale_fill_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Average duration by day of week" , fill=" customer") +
xlab("day of week") + ylab("average duration")
Summarize ride_length by time year, month, day, weekday, hour, member_casual and verify 2018-11 data with original data file
ride_length_stat_2017_2020 <- all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(year, month, day, weekday, hour, member_casual) %>%
summarise(number_of_rides = n()
,average_duration = round( mean(ride_length) , digits = 1)
,min_duration = round( min(ride_length), digits = 1)
,max_duration = round( max(ride_length), digits = 1)
,sum_duration = round( sum(ride_length), digits = 1)
) %>%
arrange(year, month, day, weekday, hour, member_casual)
#str(ride_length_stat_2017_2020)
#colnames(ride_length_stat_2017_2020)
# Verify data based on 2020-06
ride_length_stat_2017_2020 %>%
filter(year == 2018 , month == 11) %>%
group_by(member_casual) %>%
summarise( number_of_rides = sum( number_of_rides), average_ride_length = mean(average_duration ))
## # A tibble: 2 x 3
## member_casual number_of_rides average_ride_length
## <chr> <int> <dbl>
## 1 casual 11006 49.7
## 2 member 157761 13.5
all_trips_v2 %>%
filter(year == 2018 , month == 11) %>%
group_by(member_casual) %>%
summarise( number_of_rides = n(),average_ride_length = mean(ride_length))
## # A tibble: 2 x 3
## member_casual number_of_rides average_ride_length
## <chr> <int> <dbl>
## 1 casual 11006 59.8
## 2 member 157761 13.3
ride_length_stat_2017_2020 %>%
mutate(year_month = as.Date(paste(year,"-",month,"-1",sep="")) ) %>%
group_by(member_casual, year_month) %>%
summarise(number_of_rides = sum( number_of_rides )
,average_duration = sum(sum_duration) / sum(number_of_rides) ) %>%
arrange(member_casual, year_month) %>%
ggplot(aes(x = year_month, y = average_duration, group = member_casual, color = member_casual)) +
geom_line( position = position_dodge(width = 0.9) ) +
geom_point() +
# scale_color_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Year 2017 ~ 2020 Average Ride Duration" , fill=" customer type" ) +
xlab("Month") + ylab("Average Ride Duration") +
scale_x_date( labels = date_format("%Y-%m"), breaks = "1 month", minor_breaks ="1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom")
trip_2018_q1$ride_length <- difftime(trip_2018_q1$ended_at,trip_2018_q1$started_at, units = "mins")
trip_2018_q1 %>%
filter(month(started_at) == 1) %>%
group_by(month(started_at), member_casual) %>%
summarise( number_of_rides = n(),average_ride_length = mean(ride_length ))
## # A tibble: 2 x 4
## # Groups: month(started_at) [1]
## `month(started_at)` member_casual number_of_rides average_ride_length
## <dbl> <chr> <int> <drtn>
## 1 1 Customer 3490 196.26121 mins
## 2 1 Subscriber 106216 12.54422 mins
ride_length_stat_2017_2020 %>%
mutate(year_month = as.Date(paste(year,"-",month,"-1",sep="")) ) %>%
group_by(member_casual, year_month) %>%
summarise(number_of_rides = sum( number_of_rides ) ) %>%
arrange(member_casual, year_month) %>%
ggplot(aes(x = year_month, y = number_of_rides, group = member_casual, color = member_casual)) +
geom_line( position = position_dodge(width = 0.9) ) +
geom_point() +
# scale_color_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Year 2017 ~ 2020 Number Of Rides" , fill=" customer type") +
xlab("Month") + ylab("Number of Rides") +
scale_x_date( labels = date_format("%Y-%m"), breaks = "1 month", minor_breaks ="1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom")
Summary bike stations from 2017 to 2020 , exclude latitude and longitude to prevent from same station with multiple un-precise lat. lng.
station_stat_2017_2020 <- all_trips_v2 %>%
select(year, month, start_station_id, start_station_name) %>%
filter( !is.na(start_station_id)) %>%
distinct() %>%
arrange(year, month, start_station_id, start_station_name)
# Create line chart for number of stations between 2017 ~ 2020
station_stat_2017_2020 %>%
mutate(year_month = as.Date(paste(year,"-",month,"-1",sep="")) ) %>%
group_by(year_month) %>%
summarise(number_of_station = n() ) %>%
arrange(year_month) %>%
ggplot(aes(x = year_month, y = number_of_station , color = "#00A2FF") ) +
geom_line( position = position_dodge(width = 0.9) ) +
geom_point() +
scale_color_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Year 2017 ~ 2020 Number Of Stations" , subtitle = "Statistics is based on ride trip records, some stations will not be counted if that station had not ride record in correspond month.", fill=" customer type") +
xlab("Month") + ylab("Number of Stations") +
scale_x_date( labels = date_format("%Y-%m"), breaks = "1 month", minor_breaks ="1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
geom_text( aes(label = format(number_of_station, nsmall=0, big.mark=",") ), size=3, vjust = - 2, check_overlap = TRUE, color="black") # + geom_label()
# Create line chart for number of rides between 2017 ~ 2020
ride_length_stat_2017_2020 %>%
mutate(year_month = as.Date(paste(year,"-",month,"-1",sep="")) ) %>%
group_by(year_month) %>%
summarise(sum_of_rides = sum(number_of_rides) ) %>%
arrange(year_month) %>%
ggplot(aes(x = year_month, y = sum_of_rides , color = "#00A2FF") ) +
geom_line( position = position_dodge(width = 0.9) ) +
geom_point() +
scale_color_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Year 2017 ~ 2020 Number Of Rides", fill=" customer type") +
xlab("Month") + ylab("Number of Rides") +
scale_x_date( labels = date_format("%Y-%m"), breaks = "1 month", minor_breaks ="1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
geom_text( aes(label = format(sum_of_rides, nsmall=0, big.mark=",") ), size=3, vjust = -1, check_overlap = TRUE, color = "black") # + geom_label()
# Create line chart for accumulated ride length from 2017 to 2020
ride_length_stat_2017_2020 %>%
mutate(year_month = as.Date(paste(year,"-",month,"-1",sep="")) ) %>%
group_by(year_month) %>%
summarise(sum_of_ride_length = sum(sum_duration) ) %>%
arrange(year_month) %>%
ggplot(aes(x = year_month, y = sum_of_ride_length , color = "#00A2FF") ) +
geom_line( position = position_dodge(width = 0.9) ) +
geom_point() +
scale_color_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Year 2017 ~ 2020 Accumulated Ride Length", fill=" customer type") +
xlab("Month") + ylab("Accumulated Ride Length") +
scale_x_date( labels = date_format("%Y-%m"), breaks = "1 month", minor_breaks ="1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
geom_text( aes(label = format(sum_of_ride_length, nsmall=0, big.mark=",") ), size=3, vjust = -1, check_overlap = TRUE, color = "black") # + geom_label()
# Create multiple-line chart for accumulated ride length from 2017 to 2020
ride_length_stat_2017_2020 %>%
mutate(year_month = as.Date(paste(year,"-",month,"-1",sep="")) ) %>%
group_by(year_month) %>%
summarise(sum_of_ride_length = sum(sum_duration) ) %>%
arrange(year_month) %>%
ggplot(aes(x = month(year_month,label=TRUE, abbr=TRUE), y = sum_of_ride_length , group=factor(year(year_month)), colour = factor(year(year_month))) ) +
geom_line() +
geom_point() +
#scale_color_manual(values=alpha(c("#00A2FF","#0076BA"), 1)) +
labs(title = "Year 2017 ~ 2020 Accumulated Ride Length", colour="Year") +
xlab("Month") + ylab("Accumulated Ride Length") +
#scale_x_date( labels = date_format("%Y-%m"), breaks = "1 month", minor_breaks ="1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 45), legend.position="right") +
geom_text( aes(label = format(sum_of_ride_length, nsmall=0, big.mark=",") ), size=3, vjust = -1, check_overlap = TRUE, color = "black") # + geom_label()
station_list_after_2020 <- all_trips %>%
filter( year(started_at) >= 2020 & !is.na(start_station_id) & !is.na(start_lat) & !is.na(start_lng) ) %>%
group_by(start_station_id) %>%
summarise(
start_station_name = max(start_station_name),
start_lat = max(start_lat),
start_lng = max(start_lng)
) %>%
select(start_station_id, start_station_name, start_lat, start_lng) %>%
distinct( )
station_stat_2020_Q1 <- all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
filter( year == 2020 & month <= 3 ) %>%
group_by(year, month, day, weekday, hour, start_station_id, start_station_name, start_lat, start_lng, member_casual) %>%
summarise(number_of_rides = n()
,average_duration = round( mean(ride_length) , digits = 1)
,min_duration = round( min(ride_length), digits = 1)
,max_duration = round( max(ride_length), digits = 1)
,sum_duration = round( sum(ride_length), digits = 1)
) %>%
arrange(year, month, day, weekday, hour, start_station_id, start_station_name, start_lat, start_lng, member_casual, member_casual)
# Bike station list with lat. lng. data. If there multiple lat. lng. data in same station id , choose first 1
station_list_after_2020 <- all_trips %>%
filter( year(started_at) >= 2020 & !is.na(start_station_id) & !is.na(start_lat) & !is.na(start_lng) ) %>%
group_by(start_station_id) %>%
summarise(
start_station_name = max(start_station_name),
start_lat = max(start_lat),
start_lng = max(start_lng)
) %>%
select(start_station_id, start_station_name, start_lat, start_lng) %>%
distinct( )
# 2020 Q1 bike stations and its lat. lng. list
stations_2020_Q1 <- station_stat_2020_Q1 %>% ungroup() %>%
select(start_station_id, start_station_name, start_lat, start_lng) %>%
distinct( )
# Bike stations map =====
# Get map box range
bb = qbbox(stations_2020_Q1$start_lat, stations_2020_Q1$start_lng)
map.box <- c(left = bb$lonR[1]-0.1 , bottom = bb$latR[1], right = bb$lonR[2]+0.1, top = bb$latR[2])
# Get map by call get_map , this function require your Google Map API Key, I comment out this line and replace by pre-created map image in following content. You can re-run this command if you have got API key and already call register_google() funciton in top "Set Environment" section.
# If you have Google Map API key and already call register_google() funciton in top "Set Environment" section, you can run following 2 code line to gen new stattion map
#station.map <- get_map(map.box, zoom = 13 , maptype ="hybrid") # or roadmap
#save(station.map, file = "../../img/cyclistic_bike_stationp.RData")
# Or you can just run this line to restore "station.map" by loading from pre-saved RData file
load(file = "../../img/cyclistic_bike_stationp.RData")
Hotspots of Members’ ride
Hotspots of Casual Riders’ ride